library(mdsr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.0 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the CIACountries data set and draw a default base-R scatterplot of
# `educ` against `gdp`. The axis titles default to the plotted expressions.
data("CIACountries")
plot(x = CIACountries$gdp, y = CIACountries$educ)
Add axis labels
# Same scatterplot with explicit axis titles. `with()` resolves the column
# names inside CIACountries.
with(
  CIACountries,
  plot(gdp, educ, xlab = "GDP", ylab = "Education")
)
Change the shape
# Switch the plotting symbol to pch = 16.
with(
  CIACountries,
  plot(gdp, educ, xlab = "GDP", ylab = "Education", pch = 16)
)
Change the color
# Draw the points in red.
with(
  CIACountries,
  plot(gdp, educ, xlab = "GDP", ylab = "Education", pch = 16, col = "red")
)
Change the x and y limits
# Fix the plotting window: x from 0 to 150000, y from 0 to 15.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red",
    xlim = c(0, 150000), ylim = c(0, 15)
  )
)
Change the x and y limits
# Same plot as the previous step (limits: x 0-150000, y 0-15).
# NOTE(review): this step repeats the preceding one verbatim - confirm whether
# a different example was intended here.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red",
    xlim = c(0, 150000), ylim = c(0, 15)
  )
)
Aspect Ratio
# Add a fixed y/x aspect ratio (asp = 10000) on top of the previous settings.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red",
    xlim = c(0, 150000), ylim = c(0, 15),
    asp = 10000
  )
)
Change axis labels
# Suppress the default axes (xaxt/yaxt = "n") and then draw both axes by hand
# with text labels at chosen tick positions.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red",
    xlim = c(0, 150000), ylim = c(0, 15),
    asp = 10000,
    xaxt = "n", yaxt = "n"
  )
)
axis(side = 1, at = c(0, 100000, 200000),
     labels = c("None", "A little", "A lot"))
axis(side = 2, at = c(0, 5, 10, 15),
     labels = c("None", "Ok", "Good", "Great"))
Size
# Shrink the points to half size (cex = 0.5); everything else is unchanged
# from the custom-axis plot.
with(
  CIACountries,
  plot(
    gdp, educ,
    xlab = "GDP", ylab = "Education",
    pch = 16, col = "red",
    xlim = c(0, 150000), ylim = c(0, 15),
    asp = 10000,
    xaxt = "n", yaxt = "n",
    cex = 0.5
  )
)
axis(side = 1, at = c(0, 100000, 200000),
     labels = c("None", "A little", "A lot"))
axis(side = 2, at = c(0, 5, 10, 15),
     labels = c("None", "Ok", "Good", "Great"))
The key to using ggplot is to think of every command as a layer!
library(ggplot2)
# Minimal ggplot: data + aesthetic mapping, then a point layer.
ggplot(CIACountries, aes(x = gdp, y = educ)) +
  geom_point()
## Warning: Removed 64 rows containing missing values (geom_point).
# Equivalent pipe form:
# CIACountries %>% ggplot(aes(x = gdp, y = educ)) + geom_point()
Add axis labels
# Set both axis titles in a single labs() layer.
ggplot(CIACountries, aes(x = gdp, y = educ)) +
  geom_point() +
  labs(x = "GDP", y = "Education")
## Warning: Removed 64 rows containing missing values (geom_point).
Change the shape
# Use point shape 23 instead of the default.
ggplot(CIACountries, aes(x = gdp, y = educ)) +
  geom_point(shape = 23) +
  labs(x = "GDP", y = "Education")
## Warning: Removed 64 rows containing missing values (geom_point).
Change the color
# Colour the point outlines red (a fixed setting, not an aesthetic mapping).
ggplot(CIACountries, aes(x = gdp, y = educ)) +
  geom_point(shape = 23, color = "red") +
  labs(x = "GDP", y = "Education")
## Warning: Removed 64 rows containing missing values (geom_point).
Change the x and y limits
# Restrict the axes: x from 0 to 200000, y from 0 to 15.
ggplot(CIACountries, aes(x = gdp, y = educ)) +
  geom_point(shape = 23, color = "red") +
  labs(x = "GDP", y = "Education") +
  xlim(0, 200000) +
  ylim(0, 15)
## Warning: Removed 64 rows containing missing values (geom_point).
Aspect Ratio
# Fix the aspect ratio with coord_fixed(): one y unit is drawn as long as
# 10000 x units.
ggplot(CIACountries, aes(x = gdp, y = educ)) +
  geom_point(shape = 23, color = "red") +
  labs(x = "GDP", y = "Education") +
  xlim(0, 200000) +
  ylim(0, 15) +
  coord_fixed(ratio = 10000)
## Warning: Removed 64 rows containing missing values (geom_point).
Change axis values
# Replace the numeric tick labels with text. xlim()/ylim() are deliberately
# left in place to show that the later scale_*_continuous() calls replace them
# (see the messages and the note below).
ggplot(CIACountries, aes(x = gdp, y = educ)) +
  geom_point(shape = 23, color = "red") +
  labs(x = "GDP", y = "Education") +
  xlim(0, 200000) +
  ylim(0, 15) +
  coord_fixed(ratio = 10000) +
  scale_x_continuous(
    breaks = c(0, 100000, 200000),
    labels = c("None", "A little", "A lot"),
    limits = c(0, 200000)
  ) +
  scale_y_continuous(
    breaks = c(0, 5, 10, 15),
    labels = c("None", "Ok", "Good", "Great"),
    limits = c(0, 15)
  )
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Warning: Removed 64 rows containing missing values (geom_point).
# Note: scale_x_continuous overrules the xlim!
Size
# Final version of the scatterplot: same as the previous step but with smaller
# points (size = 0.5).
# The axis limits are declared once, inside scale_*_continuous(). A preceding
# xlim()/ylim() would be replaced by these scales anyway and would only make
# ggplot2 print "Scale for 'x' is already present ..." messages, so they are
# omitted here; the rendered plot is the same.
ggplot(CIACountries, aes(x = gdp, y = educ)) +
  geom_point(shape = 23, colour = "red", size = 0.5) +
  labs(x = "GDP", y = "Education") +
  coord_fixed(ratio = 10000) +
  scale_x_continuous(
    breaks = c(0, 100000, 200000),
    labels = c("None", "A little", "A lot"),
    limits = c(0, 200000)
  ) +
  scale_y_continuous(
    breaks = c(0, 5, 10, 15),
    labels = c("None", "Ok", "Good", "Great"),
    limits = c(0, 15)
  )
## Warning: Removed 64 rows containing missing values (geom_point).
library(mdsr)
data("SAT_2010")
# Base R: histogram of the `math` column with default bins.
hist(x = SAT_2010$math)
# ggplot: same histogram; without `bins`/`binwidth` ggplot2 falls back to
# 30 bins and says so in the message below.
ggplot(SAT_2010, aes(x = math)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# ggplot with explicit bin edges
# NOTE(review): the edges are unevenly spaced (widths 50, 100, 50, 100) -
# confirm this is intended.
ggplot(SAT_2010, aes(x = math)) +
  geom_histogram(breaks = c(400, 450, 550, 600, 700))
# Base R: estimate the density first, then draw it as a line.
dens <- density(SAT_2010$math)
plot(x = dens$x, y = dens$y, type = "l")
# ggplot
# This is a lot cleaner code and nicer looking!
ggplot(SAT_2010, aes(x = math)) +
  geom_density()
# More jagged: a very small bandwidth (bw = 0.01)
ggplot(SAT_2010, aes(x = math)) +
  geom_density(bw = 0.01)
# Base R: take the first ten rows, sort them by `math`, and draw one bar per
# row labelled with `state`.
sub <- head(SAT_2010, n = 10)
sub <- sub[order(sub[["math"]]), ]
with(sub, barplot(math, names.arg = state))
# ggplot: reorder() sorts the bars by `math` inside the aesthetic mapping.
ggplot(head(SAT_2010, n = 10), aes(x = reorder(state, math), y = math)) +
  geom_col() +
  labs(x = "State", y = "Average math SAT score")
### Stacked bar chart
library(mosaicData)
# Bars per `homeless` value, filled by `substance`; position = "fill"
# stacks the groups and rescales every bar to the same height.
ggplot(mosaicData::HELPrct, aes(x = homeless, fill = substance)) +
  geom_bar(position = "fill")
# Same chart with the "Spectral" ColorBrewer palette.
ggplot(mosaicData::HELPrct, aes(x = homeless, fill = substance)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Spectral")
# Same chart again, flipped so the bars run horizontally.
ggplot(mosaicData::HELPrct, aes(x = homeless, fill = substance)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Spectral") +
  coord_flip()
# Store the expenditure-vs-math scatterplot in `g` so later steps can add
# layers to it.
g <- ggplot(SAT_2010, aes(x = expenditure, y = math)) +
  geom_point()
We can easily add a trend line with ggplot.
# Add a loess trend line (no confidence band) and descriptive axis titles.
g +
  geom_smooth(method = "loess", se = FALSE) +
  labs(
    x = "Average expenditure per student ($1000)",
    y = "Average score on math SAT"
  )
## `geom_smooth()` using formula 'y ~ x'
To do this in base R you have to do this:
# Base R equivalent: draw the scatterplot, fit the loess model separately,
# then overlay the fitted values as a red line. The fitted values have to be
# sorted by x first, otherwise the line would zig-zag between points.
plot(x = SAT_2010$expenditure, y = SAT_2010$math)
a <- loess(math ~ expenditure, data = SAT_2010)
lines(a$x[order(a$x)], a$fitted[order(a$x)], col = "red")
# Bin `sat_pct` into three labelled groups: (0, 30], (30, 60], (60, 100].
SAT_2010 <- mutate(
  SAT_2010,
  SAT_rate = cut(
    sat_pct,
    breaks = c(0, 30, 60, 100),
    labels = c("low", "medium", "high")
  )
)
# g <- g %+% SAT_2010
# g + aes(color = SAT_rate)
# To do this in base R is possible, but it is a pain!
# Full code: colour is mapped once at the top level, so both the points and
# the per-group linear fits inherit it.
ggplot(SAT_2010, aes(x = expenditure, y = math, color = SAT_rate)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Spectral")
## `geom_smooth()` using formula 'y ~ x'
# Base R
# Draw one expenditure-vs-math scatterplot per SAT_rate group, side by side.
# par() returns the previous settings; they are saved and restored afterwards
# so that later base-R plots are not also squeezed into a 1 x 3 grid.
old_par <- par(mfrow = c(1, 3))
for (rate in c("low", "medium", "high")) {
  in_group <- SAT_2010$SAT_rate == rate
  plot(
    SAT_2010$expenditure[in_group],
    SAT_2010$math[in_group],
    xlab = "expenditure",
    ylab = "math score",
    main = rate # title each panel so the three groups can be told apart
  )
}
par(old_par)
So much easier in ggplot!
# g + facet_wrap( ~SAT_rate )
# g + facet_grid(~ SAT_rate )
# Full code: rebuild SAT_rate, then draw one panel per SAT_rate level with a
# linear fit in each.
SAT_2010 <- mutate(
  SAT_2010,
  SAT_rate = cut(
    sat_pct,
    breaks = c(0, 30, 60, 100),
    labels = c("low", "medium", "high")
  )
)
g <- ggplot(SAT_2010, aes(x = expenditure, y = math)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ SAT_rate)
print(g)
## `geom_smooth()` using formula 'y ~ x'
NHANES example
library(NHANES)
# Height against age for a random sample of 1000 NHANES rows, coloured by
# Gender, with a smoother per group. slice_sample() is not seeded here, so the
# sample (and the row counts in the warnings below) differs between runs.
NHANES %>%
  slice_sample(n = 1000) %>%
  ggplot(aes(x = Age, y = Height, color = Gender)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Age (years)", y = "Height (cm)", color = "Gender")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 38 rows containing non-finite values (stat_smooth).
## Warning: Removed 38 rows containing missing values (geom_point).
# Relevel the reference category: fct_relevel() moves "male" to the front of
# the Gender levels. The legend title is set explicitly because the mapped
# expression is no longer a bare column name.
library(NHANES)
NHANES %>%
  slice_sample(n = 1000) %>%
  ggplot(aes(x = Age, y = Height, color = fct_relevel(Gender, "male"))) +
  geom_point() +
  geom_smooth() +
  labs(x = "Age (years)", y = "Height (cm)", color = "Gender")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 39 rows containing non-finite values (stat_smooth).
## Warning: Removed 39 rows containing missing values (geom_point).
library(macleish)
## Loading required package: etl
# Temperature over time as a gray line with a smoother on top; the x-axis
# title is removed.
ggplot(whately_2015, aes(x = when, y = temperature)) +
  geom_line(color = "darkgray") +
  geom_smooth() +
  labs(x = NULL, y = "Temperature (degrees Celsius)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# whately_2015 %>%
# mutate(month = as.factor(lubridate::month(when, label = TRUE))) %>%
# group_by(month) %>%
# skim(temperature) %>%
# select(-na)
# Add a labelled month factor derived from the `when` column.
whately_2015[["month"]] <- as.factor(
  lubridate::month(whately_2015[["when"]], label = TRUE)
)
# Base R: one box of temperatures per month.
boxplot(
  temperature ~ month,
  data = whately_2015,
  xlab = "Month",
  ylab = expression("Temperature (" * ~ degree * C * ")")
)
# ggplot: the month label is computed inside the aesthetic mapping, so no
# helper column is needed.
ggplot(
  whately_2015,
  aes(x = lubridate::month(when, label = TRUE), y = temperature)
) +
  geom_boxplot() +
  labs(
    x = "Month",
    y = expression("Temperature ("*~degree*C*")")
  )
Check out the extended example here: https://mdsr-book.github.io/mdsr2e/ch-vizII.html#sec:babynames
Make these plots!
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'